# coding=utf-8
"""
    @project: 15python_spider
    @Author：frank
    @file： 04_baidu_image_xpath.py
    @date：2024/1/16 20:30
"""
import os
import random
import re
import time

import requests
from lxml import etree

from day02.utils.UserAgent import user_agent_list


class BaiduImageSpider(object):
    """Save the images posted in the threads of a Baidu Tieba listing page.

    ``get_tlink`` walks the thread links found on one listing page and hands
    each thread URL to ``write_image``, which downloads the thread's images
    into a directory named after the first path segment of the image URL.
    """

    # Seconds to wait for a single HTTP response, so that one stalled
    # connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Captures (directory, file stem, extension) from an image URL shaped like
    #   http://tiebapic.baidu.com/forum/w%3D580/sign=<hash>/<stem>.jpg?tbpicau=...
    # Compiled once for the class instead of once per image. The captured
    # groups exclude '/', so a remote URL cannot inject extra path components
    # into the local file name.
    _IMAGE_URL_RE = re.compile(
        r'https?://tiebapic\.baidu\.com/([^/]+)/.*?sign=.+?/([^/]+)\.([^/]+?)\?tbpicau=',
        re.S,
    )

    def __init__(self):
        # One User-Agent is drawn at random per spider instance and reused
        # for every request that instance makes.
        self.headers = {
            'User-Agent': random.choice(user_agent_list)
        }

    @classmethod
    def _local_path(cls, img_link):
        """Map an image URL to ``(directory, file_path)``.

        Returns ``None`` when the URL does not have the expected shape or
        when its directory segment would point outside the working directory.
        """
        match = cls._IMAGE_URL_RE.search(img_link)
        if match is None:
            return None
        directory, stem, extension = match.groups()
        # The URL comes from a remote page: never let it climb out of the
        # current working directory.
        if directory in ('.', '..'):
            return None
        return directory, '%s/%s.%s' % (directory, stem, extension)

    def _load_page(self, url):
        """Fetch *url* and return its parsed HTML tree (possibly ``None``)."""
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        html = response.content.decode('utf-8')
        return etree.HTML(html)

    def get_tlink(self, url):
        """Collect the thread links on the listing page *url* and save their images.

        Sleeps a random 1-5 seconds after each thread to throttle requests.
        """
        page = self._load_page(url)
        # NOTE(review): etree.HTML appears to return None for an empty
        # document; guard so that case is a no-op instead of an AttributeError.
        if page is None:
            return
        # Extract the thread links.
        hrefs = page.xpath("//*[@class='threadlist_lz clearfix']/div/a/@href")
        for href in hrefs:
            t_link = 'http://tieba.baidu.com' + href
            # Save every image of this thread locally.
            self.write_image(t_link)
            time.sleep(random.randint(1, 5))

    def write_image(self, t_link):
        """Save every image of the thread at *t_link* to the local disk.

        Images whose URL is not recognised, whose download fails, or whose
        response is an HTTP error are reported and skipped so that one bad
        link does not abort the rest of the thread.
        """
        page = self._load_page(t_link)
        if page is None:
            return
        # Find all images in the thread.
        img_list = page.xpath("//*[@class='BDE_Image']/@src")
        for img_link in img_list:
            # Work out the destination first: no point downloading an image
            # that cannot be given a file name.
            target = self._local_path(img_link)
            if target is None:
                print('skipped (unrecognised image URL): %s' % img_link)
                continue
            directory, filename = target

            try:
                response = requests.get(url=img_link, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('skipped (download failed: %s): %s' % (exc, img_link))
                continue
            if not response.ok:
                # Do not store an HTTP error body under an image file name.
                print('skipped (HTTP %s): %s' % (response.status_code, img_link))
                continue

            print(img_link)
            print(filename)
            # exist_ok avoids the check-then-create race of exists()+makedirs().
            os.makedirs(directory, exist_ok=True)
            with open(filename, 'wb') as f:
                f.write(response.content)


if __name__ == '__main__':
    # Listing page to crawl; the kw query value is percent-encoded UTF-8.
    start_url = 'https://tieba.baidu.com/f?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&ie=utf-8&pn=50'
    BaiduImageSpider().get_tlink(start_url)
    # Sample image URL kept from earlier manual testing of the file-name pattern:
    #   http://tiebapic.baidu.com/forum/w%3D580/sign=67454094b003738dde4a0c2a831bb073/cafba319ebc4b7459ded882289fc1e178a821533.jpg?tbpicau=2024-01-24-05_67145842fff1c9e80184b884559dda6a
    # The pattern splits it into ('forum', 'cafba319ebc4b7459ded882289fc1e178a821533', 'jpg').
